packages <- c(
"sp.scRNAseqData",
"seqTools",
"printr",
"ggthemes",
"tidyverse"
)
purrr::walk(packages, library, character.only = TRUE)
rm(packages)
#' Reorder an x or y axis within facets
#'
#' Reorder a column before plotting with faceting, such that the values are ordered
#' within each facet. This requires two functions: \code{reorder_within} applied to
#' the column, then either \code{scale_x_reordered} or \code{scale_y_reordered} added
#' to the plot.
#' This is implemented as a bit of a hack: it appends ___ and then the facet
#' at the end of each string.
#'
#' @param x Vector to reorder.
#' @param by Vector of the same length, to use for reordering.
#' @param within Vector of the same length that will later be used for faceting
#' @param fun Function to perform within each subset to determine the resulting
#' ordering. By default, mean.
#' @param sep Separator to distinguish the two. You may want to set this manually
#' if ___ can exist within one of your labels.
#' @param ... In \code{reorder_within} arguments passed on to \code{\link{reorder}}.
#' In the scale functions, extra arguments passed on to
#' \code{\link[ggplot2]{scale_x_discrete}} or \code{\link[ggplot2]{scale_y_discrete}}.
#'
#' @source "Ordering categories within ggplot2 Facets" by Tyler Rinker:
#' \url{https://trinkerrstuff.wordpress.com/2016/12/23/ordering-categories-within-ggplot2-facets/}
#'
#' @examples
#'
#' library(tidyr)
#' library(ggplot2)
#'
#' iris_gathered <- gather(iris, metric, value, -Species)
#'
#' # reordering doesn't work within each facet (see Sepal.Width):
#' ggplot(iris_gathered, aes(reorder(Species, value), value)) +
#' geom_boxplot() +
#' facet_wrap(~ metric)
#'
#' # reorder_within and scale_x_reordered work.
#' # (Note that you need to set scales = "free_x" in the facet)
#' ggplot(iris_gathered, aes(reorder_within(Species, value, metric), value)) +
#' geom_boxplot() +
#' scale_x_reordered() +
#' facet_wrap(~ metric, scales = "free_x")
#'
#' @export
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
new_x <- paste(x, within, sep = sep)
stats::reorder(new_x, by, FUN = fun)
}
#' @rdname reorder_within
#' @export
scale_x_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
#' @rdname reorder_within
#' @export
scale_y_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_y_discrete(labels = function(x) gsub(reg, "", x), ...)
}
norm <- function(counts) {
if(all(class(counts) == c("tbl_df", "tbl", "data.frame"))) {
mat <- counts %>%
as.data.frame() %>%
column_to_rownames("gene") %>%
as.matrix()
t(t(mat) / colSums(mat) * 10^6 + 1) %>%
matrix_to_tibble("gene")
} else if(class(counts) == "matrix") {
t(t(counts) / colSums(counts) * 10^6 + 1)
}
}
#generates one multiplet per combo
#combos: a matrix the combinations of cell types to be included in the
# multiplets. Typically generated with combn.
#adjustment: a numeric vector with length = the number of unique cell types
#indicating the fraction of contribution for each cell type. Names should
#indicate the corresponding cell type.
#seed: seed for random number initiation. When generating many multiplets by
# running the function multiple times in a loop, the seed should be set
# dynamically in the function calling the makeSyntheticData function to avoid
# all multiplets being identical.
makeSyntheticData <- function(
counts,
classes,
combos,
adjustment,
noise = FALSE,
seed = 87909023
){
set.seed(seed)
sums <- t(rowsum(t(counts), classes, reorder = FALSE))
output <- data.frame(gene = rownames(counts), stringsAsFactors = FALSE)
combos %>%
as.data.frame(stringsAsFactors = FALSE) %>%
map(., function(x) {
adj <- adjustment[match(x, names(adjustment))]
adjusted <- t(t(sums[, colnames(sums) %in% x]) * adj)
rs <- rowSums(adjusted)
if(noise) rs <- noise(rs)
expanded <- rep(names(rs), rs)
sample(
x = expanded, replace = FALSE,
size = length(expanded) / sum(classes %in% x)
) %>%
table() %>%
as.data.frame(., row.names = names(.), stringsAsFactors = FALSE) %>%
setNames(c("gene", paste(sort(x), collapse = "-")))
}) %>%
reduce(full_join, by = "gene") %>%
full_join(output, by = "gene") %>%
replace(is.na(.), 0) %>%
as_tibble()
}
noise <- function(rs) {
rs <- rs * runif(length(rs), 0.6, 1.4)
rs <- rs + rbinom(length(rs), size = 400, prob = 0.5) - 400 / 2
rs[rs < 0] <- 0
round(rs)
}
makeSyntheticData2 <- function(
counts,
classes,
combos,
adjustment,
noise = FALSE,
seed = 87909023
){
set.seed(seed)
#adjust counts by adjustment vector
adj <- rep(adjustment, length(classes) / length(adjustment))
adjusted <- t(t(counts) * adj[match(classes, names(adj))])
#calculate number of dropouts and non-dropouts per class per gene
dos <- countDropOuts(counts, classes, FUN = function(y) {length(which(y == 0))})
ndos <- countDropOuts(counts, classes, FUN = function(y) {length(which(y != 0))})
#calculate mean and sd gene expression per gene per class
ms <- means(adjusted, classes)
ss <- sds(adjusted, classes)
#setup output structure
output <- data.frame(gene = rownames(counts), stringsAsFactors = FALSE)
#generate synthetic multiplet
combos %>%
as.data.frame(stringsAsFactors = FALSE) %>%
map(., function(x) {
map_dbl(1:nrow(counts), function(y) {
if(!is.nan(ms[y, x[1]]) & !is.na(ss[y, x[1]])){
n1 <- 2^rnorm(ndos[y, x[1]], ms[y, x[1]], ss[y, x[1]])
} else {
n1 <- NA
}
if(!is.nan(ms[y, x[2]]) & !is.na(ss[y, x[2]])) {
n2 <- 2^rnorm(ndos[y, x[2]], ms[y, x[2]], ss[y, x[2]])
} else {
n2 <- NA
}
dos1 <- rep(0, dos[y, x[1]])
dos2 <- rep(0, dos[y, x[2]])
s1 <- sample(c(n1, dos1)[!is.na(c(n1, dos1))], 1)
s2 <- sample(c(n2, dos2)[!is.na(c(n2, dos2))], 1)
if_else(mean(s1, s2) < 0, 0, mean(s1, s2))
}) %>%
data.frame(gene = rownames(counts), ., stringsAsFactors = FALSE) %>%
setNames(c("gene", paste(sort(x), collapse = "-")))
}) %>%
reduce(full_join, by = "gene") %>%
full_join(output, by = "gene") %>%
replace(is.na(.), 0) %>%
as_tibble()
}
countDropOuts <- function(counts, classes, FUN = function(y) {length(which(y == 0))}) {
counts %>%
t() %>%
split(classes) %>%
map(function(x) {
m <- matrix(x, nrow = nrow(counts), byrow = TRUE)
dos <- apply(m, 1, FUN)
tibble(gene = rownames(counts), dos)
}) %>%
reduce(full_join, by = "gene") %>%
as.data.frame() %>%
column_to_rownames("gene") %>%
setNames(sort(unique(classes)))
}
means <- function(counts, classes) {
counts %>%
t() %>%
split(classes) %>%
map(function(x) {
m <- matrix(x, nrow = nrow(counts), byrow = TRUE)
means <- apply(m, 1, function(z) mean(z[z != 0]))
tibble(gene = rownames(counts), means)
}) %>%
reduce(full_join, by = "gene") %>%
as.data.frame() %>%
column_to_rownames("gene") %>%
setNames(sort(unique(classes)))
}
sds <- function(counts, classes) {
counts %>%
t() %>%
split(classes) %>%
map(function(x) {
m <- matrix(x, nrow = nrow(counts), byrow = TRUE)
sds <- apply(m, 1, function(z) sd(z[z != 0]))
tibble(gene = rownames(counts), sds)
}) %>%
reduce(full_join, by = "gene") %>%
as.data.frame() %>%
column_to_rownames("gene") %>%
setNames(sort(unique(classes)))
}
The synthetic multiplets are based on the mixing of real synthetic singlets. The algorithm for generating the synthetic multiplets works as follows: The sum of raw counts is calculated for each gene in all singlets from the cell types contributing to the multiplet. “Noise” can then optionally be added and this is recommended due to the fact that it prevents the generated multiplets from following the mean of the singlets too closley. A vector is generated where each gene name is represented one time for each of the total counts (e.g. if gene “A” has a sum of counts from all singlets from the cell types contributing to the multiplet of 10, “A” will be repreated 10 times in the vector). This vector is then sampled without replacment. It is sampled x times, where x is equal to the sum of all the counts in all singlets from the cell types contributing to the multiplet divided by the total number of singlets from the cell types contributing to the multiplet. Gene names are then counted which provides the raw counts for the newly synthesized multiplet.
Here we utilize the sorted cell line dataset to generate multiplets from the singlets. The synthesized multiplets are then compared to the real multiplets to acertain the quality of the in silico synthesis.
We begin by synthesizing one multiplet per cell line combination, i.e. a A375-HCT116, A375-HOS, and HCT116-HOS multiplet. Initially we do this without the addition of noise so that the results from “with noise” and “without noise” can be compared.
s <- str_detect(colnames(countsSorted2), "^s")
sng <- countsSorted2[, s]
classes <- slice(countsSortedMeta2, match(colnames(sng), countsSortedMeta2$sample))$cellTypes
combos <- apply(combn(unique(classes), 2), 2, sort)
adjustment <- rep(1, length(unique(classes)))
names(adjustment) <- unique(classes)
synth <- sng %>%
norm() %>%
log2() %>%
makeSyntheticData2(., classes, combos, adjustment) %>%
norm()
mul <- countsSorted2[, !s] %>%
norm() %>%
matrix_to_tibble("gene") %>%
gather(sample, counts, -gene) %>%
left_join(countsSortedMeta2, by = "sample") %>%
select(gene, counts, cellTypes, sample)
#randomly select some genes to facilitate plotting
genes <- sample(rownames(sng), 500, replace = FALSE)
concatDat <- gather(synth, cellTypes, synCounts, -gene) %>%
inner_join(., mul, by = c("gene", "cellTypes")) %>%
filter(gene %in% genes)
Examine drop-out rate in the real data compared to the synthetic data.
print("real: ")
## [1] "real: "
countsSorted2[, !s] %>%
norm() %>%
matrix_to_tibble("gene") %>%
select(filter(countsSortedMeta2, cellTypes %in% colnames(synth))$sample) %>%
as.matrix() %>%
apply(., 2, function(x) length(which(x == 1))) %>%
`/` (nrow(countsSorted2)) %>%
mean()
## [1] 0.4850146
print("synthetic: ")
## [1] "synthetic: "
synth %>%
select(-gene) %>%
as.matrix() %>%
apply(., 2, function(x) length(which(x == 1))) %>%
`/` (nrow(countsSorted2)) %>%
mean()
## [1] 0.5982248
Plot the results. The plot below contains…
concatDat %>%
ggplot() +
geom_point(aes(reorder_within(gene, counts, cellTypes, fun = "mean"), counts), size = 0.1) +
geom_line(
data = . %>% select(gene, cellTypes, synCounts) %>% distinct(),
aes(reorder_within(gene, synCounts, cellTypes, fun = "mean"), synCounts, group = cellTypes),
colour = "red"
) +
facet_wrap(~cellTypes) +
scale_x_reordered() +
theme_few() +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
) +
labs(x = "Gene", y = "cpm")
concatDat %>%
ggplot() +
geom_point(aes(reorder_within(gene, log2(counts), cellTypes, fun = "mean"), log2(counts)), size = 0.1) +
geom_point(
data = . %>% select(gene, cellTypes, synCounts) %>% distinct(),
aes(reorder_within(gene, log2(synCounts), cellTypes, fun = "mean"), log2(synCounts)),
colour = "red", size = 0.3
) +
facet_wrap(~cellTypes) +
scale_x_reordered() +
theme_few() +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
) +
labs(x = "Gene", y = "log2(cpm)")
genes <- sample(rownames(sng), 50, replace = FALSE)
concatDat <- gather(synth, cellTypes, synCounts, -gene) %>%
inner_join(., mul, by = c("gene", "cellTypes")) %>%
filter(gene %in% genes)
concatDat %>%
ggplot() +
geom_violin(aes(reorder_within(gene, log2(counts), cellTypes, fun = "mean"), log2(counts)), size = 0.1) +
geom_point(aes(reorder_within(gene, log2(counts), cellTypes, fun = "mean"), log2(counts)), size = 0.1) +
geom_point(
data = . %>% select(gene, cellTypes, synCounts) %>% distinct(),
aes(reorder_within(gene, log2(synCounts), cellTypes, fun = "mean"), log2(synCounts)),
colour = "red", size = 0.3
) +
facet_wrap(~cellTypes) +
scale_x_reordered() +
theme_few() +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
) +
labs(x = "Gene", y = "log2(cpm)")
n <- 10
names <- paste(
apply(combos, 2, paste, collapse = "-"),
rep(1:n, each = ncol(combos)),
sep = "."
)
synth <- sng %>%
norm() %>%
log2() %>%
{map(1:n, function(x) makeSyntheticData2(., classes, combos, adjustment, seed = x + 9789))} %>%
reduce(full_join, by = "gene") %>%
setNames(c("gene", names)) %>%
norm()
genes <- sample(rownames(sng), 500, replace = FALSE)
concatDat <- synth %>%
filter(gene %in% genes) %>%
gather(typeRep, synCounts, -gene) %>%
separate(typeRep, c("cellTypes", "sample"), sep = "\\.") %>%
inner_join(., mul, by = c("gene", "cellTypes"))
Examine drop-out rate
print("real: ")
## [1] "real: "
countsSorted2[, !s] %>%
norm() %>%
matrix_to_tibble("gene") %>%
select(filter(countsSortedMeta2, cellTypes %in% concatDat$cellTypes)$sample) %>%
as.matrix() %>%
apply(., 2, function(x) length(which(x == 1))) %>%
`/` (nrow(countsSorted2)) %>%
mean()
## [1] 0.4850146
print("synthetic: ")
## [1] "synthetic: "
synth %>%
select(-gene) %>%
as.matrix() %>%
apply(., 2, function(x) length(which(x == 1))) %>%
`/` (nrow(countsSorted2)) %>%
mean()
## [1] 0.5975402
concatDat %>%
ggplot() +
geom_point(
data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
aes(reorder_within(gene, counts, cellTypes, fun = "mean"), counts), size = 0.1
) +
geom_line(
data = . %>% select(gene, cellTypes, synCounts, sample.x) %>% distinct(),
aes(reorder_within(gene, synCounts, cellTypes, fun = "mean"), synCounts, group = cellTypes, colour = sample.x)
) +
facet_wrap(~cellTypes, scales = "free") +
theme_few() +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
) +
labs(x = "Gene", y = "cpm") +
guides(colour = FALSE)
concatDat %>%
ggplot() +
geom_point(
data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
aes(reorder_within(gene, log2(counts), cellTypes, fun = "mean"), log2(counts)), size = 0.1
) +
geom_point(
data = . %>% select(gene, cellTypes, synCounts, sample.x) %>% distinct(),
aes(reorder_within(gene, log2(synCounts), cellTypes, fun = "mean"), log2(synCounts), colour = sample.x),
size = 0.3
) +
facet_wrap(~cellTypes, scales = "free") +
theme_few() +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
) +
labs(x = "Gene", y = "log2(cpm)") +
guides(colour = FALSE)
genes <- sample(rownames(sng), 50, replace = FALSE)
concatDat <- synth %>%
filter(gene %in% genes) %>%
gather(typeRep, synCounts, -gene) %>%
separate(typeRep, c("cellTypes", "sample"), sep = "\\.") %>%
inner_join(., mul, by = c("gene", "cellTypes"))
concatDat %>%
filter(cellTypes == "HCT116-HOS") %>%
ggplot() +
geom_violin(
data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
aes(reorder_within(gene, log2(counts), cellTypes, fun = "mean"), log2(counts)), size = 0.1
) +
geom_point(
data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
aes(reorder_within(gene, log2(counts), cellTypes, fun = "mean"), log2(counts)), size = 0.1
) +
geom_point(
data = . %>% select(gene, cellTypes, synCounts, sample.x) %>% distinct(),
aes(reorder_within(gene, log2(synCounts), cellTypes, fun = "mean"), log2(synCounts), colour = sample.x),
size = 0.3
) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, size = 3)) +
labs(x = "Gene", y = "log2(cpm)", caption = "HCT116-HOS") +
guides(colour = FALSE)
concatDat %>%
filter(cellTypes == "A375-HCT116") %>%
ggplot() +
geom_violin(
data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
aes(reorder_within(gene, log2(counts), cellTypes, fun = "mean"), log2(counts)), size = 0.1
) +
geom_point(
data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
aes(reorder_within(gene, log2(counts), cellTypes, fun = "mean"), log2(counts)), size = 0.1
) +
geom_point(
data = . %>% select(gene, cellTypes, synCounts, sample.x) %>% distinct(),
aes(reorder_within(gene, log2(synCounts), cellTypes, fun = "mean"), log2(synCounts), colour = sample.x),
size = 0.3
) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, size = 3)) +
labs(x = "Gene", y = "log2(cpm)", caption = "A375-HCT116") +
guides(colour = FALSE)
concatDat %>%
filter(cellTypes == "A375-HOS") %>%
ggplot() +
geom_violin(
data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
aes(reorder_within(gene, log2(counts), cellTypes, fun = "mean"), log2(counts)), size = 0.1
) +
geom_point(
data = . %>% select(gene, cellTypes, counts, sample.y) %>% distinct(),
aes(reorder_within(gene, log2(counts), cellTypes, fun = "mean"), log2(counts)), size = 0.1
) +
geom_point(
data = . %>% select(gene, cellTypes, synCounts, sample.x) %>% distinct(),
aes(reorder_within(gene, log2(synCounts), cellTypes, fun = "mean"), log2(synCounts), colour = sample.x),
size = 0.3
) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, size = 3)) +
labs(x = "Gene", y = "log2(cpm)", caption = "A375-HOS") +
guides(colour = FALSE)
#tsne
set.seed(2343)
countsSorted2[, colnames(countsSorted2) %in% filter(countsSortedMeta2, cellTypes %in% concatDat$cellTypes)$sample] %>%
norm() %>%
matrix_to_tibble("gene") %>%
full_join(synth, by = "gene") %>%
as.data.frame() %>%
column_to_rownames("gene") %>%
as.matrix() %>%
`+` (1) %>%
log2(.) %>%
pearsonsCor(., select = nTopMax(., 2000)) %>%
Rtsne::Rtsne(., is_distance = TRUE, perplexity = 15) %>%
`[[` ('Y') %>%
matrix_to_tibble() %>%
add_column(sample = c(colnames(synth[, -1]), colnames(countsSorted2)[colnames(countsSorted2) %in% filter(countsSortedMeta2, cellTypes %in% concatDat$cellTypes)$sample])) %>%
mutate(type = if_else(str_detect(sample, "^m"), "real", "synthetic")) %>%
ggplot() +
geom_point(aes(V1, V2, colour = type))
rmReal <- countsSortedMeta2 %>%
filter(cellTypes %in% concatDat$cellTypes) %>%
pull(sample) %>%
match(., colnames(countsSorted2)) %>%
countsSorted2[, .] %>%
norm() %>%
as.data.frame() %>%
rownames_to_column("gene") %>%
gather(sample, value, -gene) %>%
as_tibble() %>%
inner_join(countsSortedMeta2, by = "sample") %>%
group_by(cellTypes, gene) %>%
summarize(meanReal = mean(log2(value))) %>%
ungroup()
rmSynth <- synth %>%
gather(sample, value, -gene) %>%
separate(sample, c("cellTypes", "sample"), sep = "\\.") %>%
group_by(cellTypes, gene) %>%
summarize(meanSynth = mean(log2(value))) %>%
ungroup()
rms <- full_join(rmReal, rmSynth, by = c("cellTypes", "gene"))
rms %>%
ggplot() +
geom_point(aes(meanReal, meanSynth), alpha = 1/10) +
facet_wrap(~cellTypes) +
labs(x = "Mean log2(cpm) real multiplets", y = "Mean log2(cpm) synthetic multiplets") +
theme_bw()
adjustment[names(adjustment) == "HOS"] <- 0.75
synth7 <- makeSyntheticData(sng, classes, combos, adjustment) %>%
rownames_to_column("gene") %>%
as_tibble() %>%
gather(cellTypes, synCounts7, -gene)
adjustment[names(adjustment) == "HOS"] <- 0.5
synth5 <- makeSyntheticData(sng, classes, combos, adjustment) %>%
rownames_to_column("gene") %>%
as_tibble() %>%
gather(cellTypes, synCounts5, -gene)
adjustment[names(adjustment) == "HOS"] <- 0.25
synth25 <- makeSyntheticData(sng, classes, combos, adjustment) %>%
rownames_to_column("gene") %>%
as_tibble() %>%
gather(cellTypes, synCounts25, -gene)
adjustment[names(adjustment) == "HOS"] <- 0.01
synth01 <- makeSyntheticData(sng, classes, combos, adjustment) %>%
rownames_to_column("gene") %>%
as_tibble() %>%
gather(cellTypes, synCounts01, -gene)
concatDat <- full_join(synth, mul, by = c("gene", "cellTypes")) %>%
full_join(synth7, by = c("gene", "cellTypes")) %>%
full_join(synth5, by = c("gene", "cellTypes")) %>%
full_join(synth25, by = c("gene", "cellTypes")) %>%
full_join(synth01, by = c("gene", "cellTypes")) %>%
filter(gene %in% genes) %>%
gather(synType, synCounts, -gene, -cellTypes, -counts, -sample) %>%
mutate(synType = case_when(
synType == "synCounts" ~ "100%",
synType == "synCounts7" ~ "75%",
synType == "synCounts5" ~ "50%",
synType == "synCounts25" ~ "25%",
synType == "synCounts01" ~ "1%"
)) %>%
mutate(synType = parse_factor(synType, levels = c("100%", "75%", "50%", "25%", "1%")))
ggplot() +
geom_smooth(data = concatDat, aes(counts, synCounts, colour = synType), method = 'gam') +
facet_wrap(~cellTypes, scales = "free") +
theme_few() +
theme(legend.position = "top") +
labs(x = "Sorted multiplet counts", y = "Synthetic multiplet counts") +
guides(colour = guide_legend(title = "HOS contribution to multiplet", title.position = "top", title.hjust = 0.5))